Setup

library(limma)
library(biomaRt)
library(edgeR)
library(AnnotationHub)
library(tidyverse)
library(magrittr)
library(pander)
library(RColorBrewer)
library(ggrepel)
library(swfdr)
library(qvalue)
library(scales)
library(here)
library(variancePartition)
library(org.Hs.eg.db)
library(plyr)
library(ggraph)
library(tidygraph)
library(fgsea)
library(pheatmap)
library(RUVSeq)
library(GenomicRanges)
library(cqn)
library(kableExtra)
library(goseq)
library(org.Dr.eg.db)
library(pathview)
theme_set(theme_bw())
panderOptions("big.mark", ",")
panderOptions("table.split.table", Inf)
panderOptions("table.style", "rmarkdown")
if (interactive()) setwd(here::here("R"))
# Get GC content and lengths
gcTrans <- url("https://uofabioinformaticshub.github.io/Ensembl_GC/Release96/Danio_rerio.GRCz11.96.rds") %>%
  readRDS()
# Convert to the gene-level with average length, average GC, max length, and GC of longest transcript for all transcripts assigned to a single gene.
# gcGene <- gcTrans %>%
#   split(f = .$gene_id) %>%
#   mclapply(function(x){
#     gr <- reduce(x)
#     df <- DataFrame(
#       gene_id = unique(x$gene_id),
#       gene_symbol = unique(x$gene_symbol),
#       aveLength = round(mean(x$length),0) %>% as.integer(),
#       aveGc = sum(x$length * x$gc) / sum(x$length),
#       maxLength = max(x$length),
#       longestGc = x$gc[which.max(x$length)[[1]]]
#     )
#     mcols(gr) <- df
#     gr
#   }, mc.cores = 4) %>%
#   GRangesList() %>%
#   unlist() %>%
#   na.omit()
# x# This takes a reasonable amount of time
# # Save as .rds file for faster loading
# saveRDS(gcGene, file = "../R/gcGene.rds")
gcGene <- readRDS("../R/gcGene.rds")

Annotation was set up as a EnsDb object based on Ensembl Release 96

ah <- AnnotationHub() %>%
  subset(species == "Danio rerio") %>%
  subset(dataprovider == "Ensembl") %>%
  subset(rdataclass == "EnsDb")
ensDb <- ah[["AH69169"]]
genes <- genes(ensDb)
transGR <- transcripts(ensDb)
mcols(genes) <- mcols(genes)[c("gene_id", "gene_name", 
                                   "gene_biotype", "entrezid","description")] %>%
  as.data.frame() %>%
  dplyr::select(-entrezid) %>%
  left_join(as.data.frame(mcols(gcGene))) %>%
  distinct(gene_id, .keep_all = TRUE) %>%
  set_rownames(.$gene_id) %>%
  DataFrame() %>%
  .[names(genes),]
genesGR <- genes(ensDb)
DrEns2Symbol <- genesGR %>%
    mcols() %>%
    as_tibble() %>%
    dplyr::select(gene_id, gene_name)

Prior to count-level analysis, the initial dataset was pre-processed using the following steps:

  • Adapters were removed from any reads
  • Bases were removed from the end of reads when the quality score dipped below 30
  • Reads < 35bp after trimming were discarded

After trimming alignment was performed using STAR v2.5.3a to the Danio rerio genome included in Ensembl Release 96 (GRCz11). Aligned reads were counted for each gene if the following criteria were satisfied:

  • Alignments were unique
  • Alignments strictly overlapped exonic regions
minSamples <- 6
minCpm <- 1
counts <- file.path("../2_alignedData/featureCounts/genes.out") %>%
    read_tsv() %>%
    set_colnames(basename(colnames(.))) %>%
    set_colnames(str_remove(colnames(.), "Aligned.+")) %>%
  gather(key = "Library", value = "Counts", -Geneid) %>%
  dplyr::mutate(Sample = str_remove_all(Library, "_2")) %>%
  group_by(Geneid, Sample) %>%
  dplyr::summarise(Counts = sum(Counts)) %>%
  tidyr::spread(key = "Sample", value = "Counts") %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") 
genes2keep <- cpm(counts) %>%
  is_greater_than(minCpm) %>%
  rowSums() %>%
  is_weakly_greater_than(minSamples)

Counts were then loaded and summed across replicate sequencing runs. Genes were only retained if receiving more than 1 cpm in at least 6 samples. This filtering step discarded 12,661 genes as undetectable and retained 19,396 genes for further analysis.

plotDensities(cpm(counts, log = TRUE), legend = FALSE, main = "a) All genes")
plotDensities(cpm(counts[genes2keep,], log = TRUE), legend = FALSE, main = "b) Retained genes")
Distribution of logCPM values for a) all and b) retained genesDistribution of logCPM values for a) all and b) retained genes

Distribution of logCPM values for a) all and b) retained genes

dgeList <- counts %>%
  .[genes2keep,] %>%
  DGEList(
    samples = tibble(
      sample = colnames(.),
      group = str_replace_all(sample, "[0-9]*[A-Z]([12])Lardelli", "\\1"),
      pair = str_replace_all(sample, "[0-9]*([A-Z])[0-9].+", "\\1")
    ) %>%
      as.data.frame(),
        genes = genes[rownames(.)] %>%
            as.data.frame() %>%
            dplyr::select(
                ensembl_gene_id = gene_id,
                chromosome_name = seqnames,
                description,
                external_gene_name = gene_name,
                aveLength,
                aveGc,
                maxLength,
                longestGc
            )
    ) %>%
    calcNormFactors(method = "TMM") %>%
    estimateDisp()
## Design matrix not provided. Switch to the classic mode.
dgeList$samples$Genotype <- c("Q96K97del/+", "WT")[dgeList$samples$group] %>%
  factor(levels = c("WT", "Q96K97del/+"))

Counts were then formed into a DGEList object. Library sizes after alignment, counting and gene filtering ranged between 44,498,373 and 52,779,608 reads.

Data Inspection

v <- dgeList %>%
    voomWithQualityWeights(design = matrix(1, nrow = ncol(.)))
Sample weights with the ideal equal weight of 1 shown as a horizontal line.

Sample weights with the ideal equal weight of 1 shown as a horizontal line.

pca <- v$E %>%
    t() %>%
    prcomp() 
summary(pca)$importance %>% pander(split.tables = Inf)    
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12
Standard deviation 21.98 17.89 14.06 12.92 11.64 11.14 10.79 10.61 9.846 7.989 7.633 5.872e-14
Proportion of Variance 0.2576 0.1706 0.1055 0.08908 0.07222 0.06621 0.06209 0.05998 0.05169 0.03403 0.03107 0
Cumulative Proportion 0.2576 0.4282 0.5336 0.6227 0.6949 0.7611 0.8232 0.8832 0.9349 0.9689 1 1
PCA of all samples. Point sizes indicate library sizes with the combination of PC1 & PC2 appearing to capture a considerable amount of this variation.

PCA of all samples. Point sizes indicate library sizes with the combination of PC1 & PC2 appearing to capture a considerable amount of this variation.

#DGE Analysis

To run a mixed-effects (nested) analysis we must use a voom object. Here we are looking at the change due to genotype within each pair. We then look at the mean of the differences, instead of the difference in means.

voomData <- voom(dgeList, plot = FALSE)
fm <- ~ Genotype + (1|pair)
contrMat <- getContrast(voomData, fm, voomData$targets, "GenotypeQ96K97del/+")
library(doParallel)
cl <- makeCluster(7)
registerDoParallel(cl)
fit <- dream(voomData, fm, voomData$targets, contrMat) %>% eBayes()
## Projected memory usage: > 506.5 Mb 
## 
## Finished...
## Total: 293 s
# stop cluster
stopCluster(cl)
deGenes <- topTable(fit, n = Inf) %>%
  as_tibble() %>%
  mutate(
    q = qvalue(P.Value)$qvalues,
    Sig = q < 0.05) %>%
  dplyr::select(
    ensembl_gene_id, external_gene_name, logFC, t,
  P.Value, Sig, q, aveGc, aveLength, maxLength, longestGc,AveExpr)
head(deGenes)
## # A tibble: 6 x 12
##   ensembl_gene_id external_gene_n…  logFC     t P.Value Sig        q aveGc
##   <chr>           <chr>             <dbl> <dbl>   <dbl> <lgl>  <dbl> <dbl>
## 1 ENSDARG0000009… CABZ01034698.2    1.54   7.75 4.15e-6 TRUE  0.0215 0.359
## 2 ENSDARG0000009… si:ch211-213a13…  1.02   7.12 1.25e-6 TRUE  0.0155 0.399
## 3 ENSDARG0000007… leng8             0.324  7.38 6.94e-6 TRUE  0.0215 0.431
## 4 ENSDARG0000007… tex11             1.56   6.28 5.75e-6 TRUE  0.0215 0.456
## 5 ENSDARG0000009… si:dkey-9i23.15  -0.641 -6.00 9.84e-6 TRUE  0.0225 0.413
## 6 ENSDARG0000003… fuom             -0.924 -5.95 1.09e-5 TRUE  0.0225 0.342
## # … with 4 more variables: aveLength <int>, maxLength <int>,
## #   longestGc <dbl>, AveExpr <dbl>
deGenesSig <- deGenes %>%
  dplyr::filter(Sig == TRUE)
Volcano plot highlighting DE genes. Genes indicated in red belowd were considered as DE, which receive a q-value < 0.05.

Volcano plot highlighting DE genes. Genes indicated in red belowd were considered as DE, which receive a q-value < 0.05.

logFC plotted against expression level with significant DE genes shown in red.

logFC plotted against expression level with significant DE genes shown in red.

RUV treatment (Remove Unwanted Variation)

Find some ‘unchanged’ genes. Grab the lowest ranked 5000

genes4Control <- deGenes %>%
  arrange(desc(P.Value)) %>%
  dplyr::slice(1:5000) %>%
  .[["ensembl_gene_id"]]
length(genes4Control)
## [1] 5000
k <- 1
ruv <- RUVg(
  dgeList$counts, 
  cIdx = genes4Control,
  k = k, 
  isLog = FALSE,
  round = TRUE
)
dgeList$samples <- cbind(dgeList$samples, ruv$W)
pcaRUV <- ruv$normalizedCounts %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp() 
summary(pcaRUV)$importance %>% pander(split.tables = Inf)    
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12
Standard deviation 17.5 13.57 12.59 11.22 10.81 10.43 10.21 9.442 7.626 7.252 0.3357 4.834e-14
Proportion of Variance 0.2347 0.1411 0.1215 0.09653 0.0896 0.08332 0.07989 0.06831 0.04456 0.0403 9e-05 0
Cumulative Proportion 0.2347 0.3759 0.4974 0.5939 0.6835 0.7669 0.8467 0.9151 0.9596 0.9999 1 1
pcaRUV$x %>%
  cbind(dgeList$samples[rownames(.),]) %>%
  ggplot(aes(PC1, PC2, colour = Genotype)) +
  geom_point() +
  stat_ellipse(aes(fill = Genotype), geom = "polygon", alpha = 0.2) +
  xlab(
    paste0(
      "PC1 (", percent(summary(pcaRUV)$importance[2, "PC1"]), ")"
    )
  ) +
  ylab(
    paste0(
      "PC2 (", percent(summary(pcaRUV)$importance[2, "PC2"]), ")"
    )
  )

Setup a design matrix using each pool as its own intercept, with the genotype as the common difference

expDesign <- model.matrix(~0 + pair + Genotype + W_1, dgeList$samples) %>%
  set_colnames(str_replace(colnames(.), pattern = "Genotype.+", "Mutant"))

Recalculate the dispersions

dgeList %<>%
  estimateGLMCommonDisp(expDesign) %>%
  estimateGLMTagwiseDisp(expDesign)

Now run DE Analysis after RUV

topTable <- dgeList %>%
  glmFit(expDesign) %>%
  glmLRT(coef = "Mutant") %>%
  topTags(n = Inf) %>%
  as.data.frame() %>%
  set_colnames(gsub("ID.", "", colnames(.))) %>%
  as_tibble() %>%
  mutate(
    DE = FDR < 0.01,
    rankstat = -sign(logFC)*log10(PValue)
  ) %>%
  dplyr::select(
    ensembl_gene_id, external_gene_name, logFC, logCPM, LR,
  PValue, FDR, DE,aveGc, aveLength, rankstat
  ) %>%
  arrange(PValue)
topTableDE <- topTable %>%
  dplyr::filter(DE == TRUE)

write.csv(topTableDE,"../R/DEgenes.csv")

Check the MA plot

tested <- c("si:ch211-213a13.2", "si:ch211-11p18.6", "mdh1ab", "CABZ01034698.2")
# topTable %>%
topTable %>%
  ggplot(aes(logCPM, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5) +
  geom_smooth(se = FALSE) +
  geom_hline(yintercept = c(-1, 1)*0.5, linetype = 2) +
  geom_text_repel(
    aes(label = external_gene_name, colour = DE),
    data = . %>%
      dplyr::filter(abs(logFC) > 1.2 & DE | external_gene_name %in% tested),
    show.legend = FALSE
  ) +
  scale_colour_manual(values = c("grey50", "red"))

Check the volcano plot

topTable %>%
  mutate(DE = FDR < 0.01) %>%
  ggplot(aes(logFC, -log10(PValue))) +
  geom_point(aes(colour = DE), alpha = 0.5) +
  geom_vline(xintercept = c(-1, 1)*0.5, linetype = 2) +
  geom_text_repel(
    aes(label = external_gene_name, colour = DE),
    data = . %>%
      dplyr::filter(abs(logFC) > 1.2 & DE | external_gene_name %in% tested),
    show.legend = FALSE
  ) +
  scale_colour_manual(values = c("grey50", "red"))

Compare the two lists

topTable %>%
  dplyr::select(ensembl_gene_id, RUV = logFC, DE) %>%
  left_join(deGenes) %>%
  dplyr::rename(Voom = logFC) %>%
  mutate(
    Status = case_when(
      Sig & DE ~ "Both",
      Sig & !DE ~ "Voom Only",
      !Sig & DE ~ "RUV Only",
      !Sig & !DE ~ "Not DE"
    )
  ) %>%
  ggplot(aes(Voom, RUV,)) +
  geom_point(aes(colour = Status), alpha = 0.5) +
  geom_abline(slope= 1, linetype = 2) +
  geom_vline(xintercept = c(-1, 1)*0.5) +
  geom_hline(yintercept = c(-1, 1)*0.5) +
  scale_colour_manual(values = c("green", "grey50", "red", "blue"))

The 231 genes considered as DE using an FDR of 0.01 and logFC beyond the range \(\pm 0.5\) were then inspected to confirm that the counts support their inclusion as DE.

Here plot CPM for DE genes having a FDR below 0.01 and logFC beyond the range \(\pm 1.2\)

Expression values for each of the potential DE genes using raw CPM values and a log-scale y-axis.

Expression values for each of the potential DE genes using raw CPM values and a log-scale y-axis.

GC content and gene length check

GC content before RUV

deGenes %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, logFC)) +
  geom_point(aes(colour = Sig), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) +
  labs(
    x = "GC Content"
  ) +
  scale_color_manual(values = c("grey70", "red"))

deGenes %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, t)) +
  geom_point(aes(colour = Sig), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) +
  labs(
    x = "GC Content",
    y = "Ranking Statistic"
  ) +
  scale_color_manual(values = c("grey70", "red"))

Gene length before RUV

deGenes %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveLength, logFC)) +
  geom_point(aes(colour = Sig), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) +
  labs(
    x = "Gene length",
    title = "without cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)

deGenes %>%
  dplyr::arrange(desc(P.Value)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveLength, t)) +
  geom_point(aes(colour = Sig), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) +
  labs(
    x = "Gene Length",
    y = "Ranking Statistic",
    title = "without cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)

GC content after RUV

topTable %>%
  dplyr::arrange(desc(PValue)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content"
  ) +
  scale_color_manual(values = c("grey70", "red"))

topTable %>%
  dplyr::arrange(desc(PValue)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveGc, rankstat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  ylim(-10, 10) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "GC Content", 
    y = "Ranking Statistic"
  ) +
  scale_color_manual(values = c("grey70", "red"))

Gene length after RUV

topTable %>%
  dplyr::arrange(desc(PValue)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveLength, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene length",
    title = "without cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)

topTable %>%
  dplyr::arrange(desc(PValue)) %>%
  dplyr::filter(!is.na(aveGc)) %>%
  ggplot(aes(aveLength, rankstat)) +
  geom_point(aes(colour = DE), alpha = 0.5, show.legend = FALSE) +
  ylim(-10, 10) +
  geom_smooth(se = FALSE) + 
  labs(
    x = "Gene Length", 
    y = "Ranking Statistic",
    title = "without cqn"
  ) +
  scale_color_manual(values = c("grey70", "red")) +
  scale_x_log10(labels = comma)

Goseq analysis

The first step here is to map the Ensembl zebrafish gene IDs to Human Entrez gene IDs.

# Load id conversion file
idConvert <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
  dplyr::select(Geneid = zfID, EntrezID = Entrez) %>%
  mutate(EntrezID = as.character(EntrezID))
# Create function to convert ids, but only keep the IDs in the voom object
convertHsEG2Dr <- function(ids, df = idConvert){
  dplyr::filter(df, EntrezID %in% ids)$Geneid
}

# Conversion of zebrafish ensembl ID to zebrafish symbol, for plotting on network analyses
idConvertSymbol <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
  dplyr::select(label = zfID, symbol = zfName) %>%
  na.omit() %>%
  unique()
# Import hallmark human gene genesets and tidy gene set names
# .gmt files downloaded from:
# http://software.broadinstitute.org/gsea/downloads.jsp 
# http://data.wikipathways.org/20190610/ 
hallmark <- gmtPathways("../files/h.all.v6.2.entrez.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "HALLMARK_"))
kegg <- gmtPathways("../files/c2.cp.kegg.v6.2.entrez.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "KEGG_"))
wiki <- gmtPathways("../files/wikipathways-20190610-gmt-Homo_sapiens.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "%.+"))
pwf <- topTable %>% 
  dplyr::select(ensembl_gene_id, logFC, DE, aveLength, aveGc) %>%
  mutate(nGC = aveLength*aveGc) %>% 
  distinct(ensembl_gene_id, .keep_all = TRUE) %>% 
  with(
    nullp(
      DEgenes = structure(
        as.integer(DE), names = ensembl_gene_id
      ), 
      genome = "danRer10", 
      id = "ensGene", 
      # bias.data = log(nGC),
      bias.data = aveLength,
      plot.fit = FALSE
    )
  )
Probability weight function for a gene being considered as DE based on the number of GC bases per gene.

Probability weight function for a gene being considered as DE based on the number of GC bases per gene.

Goseq pathway analysis

Hallmark Gene Sets

Mappings from gene to pathway are also required, instead of the previous pathway to gene.

hallmarkByGene <- names(hallmark) %>%
  lapply(function(x){
    tibble(pathway = x, ensembl_gene_id = hallmark[[x]])
  }) %>% 
  bind_rows() %>% 
  dplyr::filter(!is.na(ensembl_gene_id)) %>%
  split(f = .$ensembl_gene_id) %>% 
  lapply(extract2, "pathway") 
hallmarkGoseq <- pwf %>%
    goseq(gene2cat = hallmarkByGene) %>%
    as_tibble %>%
    mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr"))
Most highly ranked hallmark pathways.
Pathway numDEInCat numInCat PValue FDR
G2M_CHECKPOINT 15 212 8.438e-08 4.219e-06
E2F_TARGETS 10 185 0.0001866 0.004666
ESTROGEN_RESPONSE_LATE 8 226 0.01228 0.2046
BILE_ACID_METABOLISM 5 118 0.02236 0.2795
CHOLESTEROL_HOMEOSTASIS 4 86 0.03098 0.2928

KEGG Pathways

The same approach was then applied to the set of KEGG gene sets

keggByGene <- names(kegg) %>%
    lapply(function(x){
        tibble(pathway = x, ensembl_gene_id = kegg[[x]])
    }) %>% 
    bind_rows() %>% 
    split(f = .$ensembl_gene_id) %>% 
    lapply(extract2, "pathway") 
keggGoseq <- pwf %>%
  goseq(gene2cat = keggByGene) %>%
  as_tibble %>%
  mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr"))
Most highly ranked KEGG pathways.
Pathway numDEInCat numInCat PValue FDR
DNA_REPLICATION 7 30 9.749e-09 1.813e-06
CELL_CYCLE 8 116 1.018e-05 0.0009467
FATTY_ACID_METABOLISM 3 46 0.009046 0.4674
RETINOL_METABOLISM 4 90 0.01005 0.4674
BIOSYNTHESIS_OF_UNSATURATED_FATTY_ACIDS 2 21 0.01668 0.5518

wiki pathway

wikiByGene <- names(wiki) %>%
    lapply(function(x){
        tibble(pathway = x, ensembl_gene_id = wiki[[x]])
    }) %>% 
    bind_rows() %>% 
    split(f = .$ensembl_gene_id) %>% 
    lapply(extract2, "pathway") 
wikiGoseq <- pwf %>%
  goseq(gene2cat = wikiByGene) %>%
  as_tibble %>%
  mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr"))
Most highly ranked wiki pathways.
Pathway numDEInCat numInCat PValue FDR
DNA Replication 7 35 9.925e-08 5.201e-05
Retinoblastoma Gene in Cancer 9 82 2.47e-07 6.471e-05
G1 to S cell cycle control 7 60 4.021e-06 0.0007024
Cell Cycle 8 112 2.947e-05 0.003861
Ciliary landscape 9 210 0.0004896 0.05131

Goseq GO analysis

goByGene <- links(org.Dr.egGO2ALLEGS) %>%
  as_tibble() %>%
  left_join(
    links(org.Dr.egENSEMBL2EG)
  ) %>%
  distinct(ensembl_id, go_id) %>%
  dplyr::filter(ensembl_id %in% topTable$ensembl_gene_id) %>%
  split(f = .$ensembl_id) %>%
  lapply(extract2, "go_id")
goGoseq <- pwf %>%
  goseq(gene2cat = goByGene) %>%
  as_tibble %>%
  mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr"))
goGoseq %>%
    dplyr::filter(FDR < 0.05) %>%
    dplyr::select(
        Pathway = category, 
        starts_with("num"), 
        term,
        ontology,
        PValue = over_represented_pvalue, 
        FDR
    ) %>%
    pander(
        caption = "Most highly ranked GO pathways.",
        justify = "lrrrrrrrr"
    )

Quitting from lines 771-795 (PairedAnalysiswithRUV.Rmd) Error in pandoc.table.return(…) : Wrong number of parameters (9 instead of 7) passed: justify Calls: … pander.data.frame -> pandoc.table -> cat -> pandoc.table.return In addition: There were 11 warnings (use warnings() to see them)

goGoseqTop <- goGoseq %>%
    dplyr::filter(FDR < 0.05) %>%
    dplyr::select(
        Pathway = category, 
        starts_with("num"), 
        term,
        ontology,
        PValue = over_represented_pvalue, 
        FDR)
write.csv(goGoseqTop, "../R/GoTerms.csv")

notch signaling genes (Not significantly-changed GO)

vapply(goByGene, function(x){"GO:0007219" %in% x}, logical(1)) %>% which() %>% names() %>% intersect(dplyr::filter(topTable, DE)$ensembl_gene_id)
## [1] "ENSDARG00000004232" "ENSDARG00000026629"

iron ion transport genes (significantly-changed GO)

vapply(goByGene, function(x){"GO:0006826" %in% x}, logical(1)) %>% which() %>% names() %>% intersect(dplyr::filter(topTable, DE)$ensembl_gene_id)
## [1] "ENSDARG00000016771" "ENSDARG00000077360" "ENSDARG00000077372"
## [4] "ENSDARG00000094210"

GO plot

## Get significant GO terms
sigGo <- goGoseq %>%
    dplyr::filter(FDR < 0.05) %>%
    .$category

## Convert list of GO terms by gene to list of genes by GO term
geneByGo <- names(goByGene) %>% 
    lapply(function(x){tibble(gene_id = x, go_id = goByGene[[x]])}) %>% 
    bind_rows() %>% 
    split(f = .$go_id) %>% 
    lapply(magrittr::extract2, "gene_id")

## Get DE genes that belong to sigificant GO terms
goGenes <- lapply(
    sigGo, 
    function(x){
        geneByGo[[x]][geneByGo[[x]] %in% topTableDE$ensembl_gene_id]
    }
) 
names(goGenes) <- sigGo

## Make tibble of GO terms
goTerms <- names(goGenes) %>%
    tibble::enframe(name = NULL, value = "label")
## Make tibble of genes
genes <- unlist(goGenes) %>% 
    unique() %>%
    tibble::enframe(name = NULL, value = "label") %>%
    mutate

## Join to create node list
nodes <- rbind(goTerms, genes) %>%
    rowid_to_column("id")

## Create edge list
edges <- goGenes %>%
    stack() %>%
    as_tibble() %>%
    dplyr::select(goTerm = ind, geneId = values) %>%
    dplyr::arrange(goTerm) %>%
    mutate(goTerm = as.character(goTerm)) %>%
    left_join(nodes, by = c("goTerm" = "label")) %>%
    dplyr::rename(from = id) %>%
    left_join(nodes, by = c("geneId" = "label")) %>%
    dplyr::rename(to = id) %>%
    dplyr::select(from, to)

## Setup colours
colours <- length(sigGo) %>%
    rainbow()

## Create tidygraph object
tidy <- tbl_graph(
    nodes = nodes, 
    edges = edges, 
    directed = FALSE
) %>%
    activate(nodes) %>%
    mutate(
        goTerms = case_when(
            id <= length(sigGo) ~ label
        ),
        term = Term(label),
        gene_id = case_when(
            !label %in% sigGo ~ label
        ),
        colour = case_when(
            id <= length(sigGo) ~ colours[id]
        ),
        size = ifelse(id <= length(sigGo), 4, 1)
    ) %>%
    left_join(DrEns2Symbol) %>%
    activate(edges) %>%
    mutate(
        colour = case_when(
            from <= length(sigGo) ~ colours[from]
        )
    )

## Set seed to allow same graph to be produced each time function is executed
set.seed(1234)

## Plot network graph
tested2 <- c("mcm2", "mcm3", "mcm4", "mcm5", "mcm6", "mcm7", "tfa", "tfr1b", "fthl31", "fthl30")
ggraph(tidy, layout = "fr") +
    scale_fill_manual(
        values = "white", 
        na.value = "gray80"
    ) +
    scale_edge_color_manual(
        values = "black", 
        na.value = "gray80"
    ) +
    geom_edge_arc(
        aes(color = "black"), 
        alpha = 0.5, 
        show.legend = FALSE, 
        curvature = 0.5
    ) +
    geom_node_point(
        aes(fill = "black", size = size),
        shape = 21,
        stroke = 0.5, 
        show.legend = FALSE
    ) +
    geom_node_label(
        aes(label = goTerms),
        repel = TRUE,
        size = 3,
        alpha = 0.7,
        label.padding = 0.1
    ) +
    theme_graph() +
    theme(legend.position = "none")

###Gene Set Enrichment analysis (GSEA)

ranks <- topTable %>%
  mutate(stat = -sign(logFC) * log10(PValue)) %>%
  dplyr::arrange(stat) %>%
  with(structure(stat, names = ensembl_gene_id))

GSEA

Hallmark pathways

set.seed(22)
# Run GSEA for hallmark
fgseaHallmark <- fgsea(hallmark, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)

fgseaHallmarkTop <- fgseaHallmark %>%
  dplyr::filter(padj < 0.05) 

fgseaHallmarkTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Hallmark pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 15 most significantly enriched Hallmark pathways. This corresponds to an FDR of 0.291%
pathway pval FDR ES NES size padj
MYOGENESIS 1.169e-05 7.728e-05 -0.5068 -1.781 278 0.0005843
EPITHELIAL_MESENCHYMAL_TRANSITION 1.193e-05 7.728e-05 -0.5189 -1.8 238 0.0005963
MTORC1_SIGNALING 1.198e-05 7.728e-05 -0.5444 -1.885 232 0.0005988
GLYCOLYSIS 1.199e-05 7.728e-05 -0.5276 -1.826 230 0.0005994
ESTROGEN_RESPONSE_LATE 1.202e-05 7.728e-05 -0.5632 -1.946 226 0.0006008
OXIDATIVE_PHOSPHORYLATION 1.206e-05 7.728e-05 -0.5307 -1.829 220 0.0006029
G2M_CHECKPOINT 1.211e-05 7.728e-05 -0.6525 -2.242 212 0.0006057
E2F_TARGETS 1.236e-05 7.728e-05 -0.6436 -2.181 185 0.0006182
XENOBIOTIC_METABOLISM 2.334e-05 0.0001224 -0.4913 -1.728 280 0.001167
FATTY_ACID_METABOLISM 2.449e-05 0.0001224 -0.5435 -1.855 198 0.001224
CHOLESTEROL_HOMEOSTASIS 2.744e-05 0.0001247 -0.6212 -1.924 86 0.001372
ESTROGEN_RESPONSE_EARLY 0.0001084 0.0004516 -0.4879 -1.683 222 0.005419
COAGULATION 0.0002037 0.0007835 -0.5245 -1.742 151 0.01019
MYC_TARGETS_V1 0.0005558 0.001985 -0.4612 -1.588 218 0.02779
COMPLEMENT 0.0008723 0.002908 -0.4488 -1.555 235 0.04361
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
  hallmark[dplyr::filter(fgseaHallmark, padj < 0.05)$pathway], ranks, fgseaHallmark, gseaParam = 0.5
)

KEGG pathways

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for KEGG
fgseaKEGG <- fgsea(kegg, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaKEGGTop <- fgseaKEGG %>%
  dplyr::filter(padj < 0.05)
fgseaKEGGTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched KEGG pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 24 most significantly enriched KEGG pathways. This corresponds to an FDR of 0.182%
pathway pval FDR ES NES size padj
METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 1.102e-05 0.0001888 -0.6019 -2.181 438 0.00205
DRUG_METABOLISM_CYTOCHROME_P450 1.104e-05 0.0001888 -0.5839 -2.114 431 0.002054
RETINOL_METABOLISM 1.105e-05 0.0001888 -0.517 -1.871 429 0.002055
DRUG_METABOLISM_OTHER_ENZYMES 1.139e-05 0.0001888 -0.544 -1.938 333 0.002119
STEROID_HORMONE_BIOSYNTHESIS 1.146e-05 0.0001888 -0.5685 -2.02 321 0.002131
STARCH_AND_SUCROSE_METABOLISM 1.163e-05 0.0001888 -0.6105 -2.15 285 0.002164
PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 1.164e-05 0.0001888 -0.5904 -2.079 284 0.002165
PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS 1.173e-05 0.0001888 -0.6101 -2.139 269 0.002182
ASCORBATE_AND_ALDARATE_METABOLISM 1.175e-05 0.0001888 -0.6125 -2.146 267 0.002185
GLYCOLYSIS_GLUCONEOGENESIS 1.345e-05 0.0001888 -0.6896 -2.178 99 0.002502
ECM_RECEPTOR_INTERACTION 1.36e-05 0.0001888 -0.6702 -2.093 91 0.002529
FATTY_ACID_METABOLISM 1.402e-05 0.0001888 -0.7896 -2.384 71 0.002607
TYROSINE_METABOLISM 1.402e-05 0.0001888 -0.695 -2.098 71 0.002607
GLUTATHIONE_METABOLISM 1.421e-05 0.0001888 -0.8061 -2.396 64 0.002643
BETA_ALANINE_METABOLISM 1.555e-05 0.0001928 -0.8038 -2.068 28 0.002892
BUTANOATE_METABOLISM 2.958e-05 0.0003439 -0.7338 -2.061 45 0.005502
CELL_CYCLE 3.942e-05 0.0004313 -0.5798 -1.87 116 0.007333
NITROGEN_METABOLISM 4.558e-05 0.000471 -0.7684 -2.061 35 0.008478
PYRIMIDINE_METABOLISM 9.477e-05 0.0009277 -0.5889 -1.85 95 0.01763
SULFUR_METABOLISM 0.000135 0.001255 0.5906 1.978 64 0.0251
PROXIMAL_TUBULE_BICARBONATE_RECLAMATION 0.0001476 0.001307 -0.6879 -1.939 46 0.02745
OXIDATIVE_PHOSPHORYLATION 0.0001551 0.001311 -0.5359 -1.753 131 0.02885
DNA_REPLICATION 0.0002314 0.001822 -0.7475 -1.948 30 0.04303
HEMATOPOIETIC_CELL_LINEAGE 0.000235 0.001822 -0.5375 -1.743 122 0.04372
pv.out <- pathview(gene.data = ranks, 
         pathway.id = "00010", 
         species = "Danio rerio", 
         gene.idtype = "ENSEMBL",
         limit = list(gene=5, cpd=1))
## [1] "Note: 1192 of 19396 unique input IDs unmapped."
# Make a table plot of significant KEGG pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
  kegg[fgseaKEGGTop$pathway], ranks, fgseaKEGG, gseaParam = 0.5
)

WikiPathways

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for WikiPathways
fgseaWiki <- fgsea(wiki, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaWikiTop <- fgseaWiki %>%
  dplyr::filter(padj < 0.05)
fgseaWikiTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Wiki pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 10 most significantly enriched Wiki pathways. This corresponds to an FDR of 0.449%
pathway pval FDR ES NES size padj
Metapathway biotransformation Phase I and II 1.066e-05 0.001206 -0.4543 -1.672 585 0.005595
Nuclear Receptors Meta-Pathway 1.07e-05 0.001206 -0.4985 -1.831 564 0.005619
NRF2 pathway 1.148e-05 0.001206 -0.6226 -2.209 317 0.006028
Glucuronidation 1.169e-05 0.001206 -0.6133 -2.154 278 0.006136
Amino Acid metabolism 1.327e-05 0.001206 -0.6151 -1.964 109 0.006969
Retinoblastoma Gene in Cancer 1.379e-05 0.001206 -0.6882 -2.117 82 0.007238
Cell Cycle 3.971e-05 0.002962 -0.5792 -1.856 112 0.02085
Glutathione metabolism 4.514e-05 0.002962 -0.7314 -1.99 38 0.0237
G1 to S cell cycle control 5.727e-05 0.003341 -0.6812 -2.001 60 0.03007
Glycolysis and Gluconeogenesis 8.548e-05 0.004488 -0.6523 -1.926 62 0.04488
# Make a table plot of significant WikiPathways pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
  wiki[fgseaWikiTop$pathway], ranks, fgseaWiki, gseaParam = 0.5
)

Gene expression Plot

Expression of *fthl31* across all samples.

Expression of fthl31 across all samples.

Expression of *tfa* across all samples.

Expression of tfa across all samples.

Expression of *tfr1b* across all samples.

Expression of tfr1b across all samples.

IRE enrichment test

ireGR <- list.files(pattern = "gff.gz") %>%
    sapply(rtracklayer::import.gff, simplify = FALSE) %>%
    lapply(function(x){
        tibble(
            tx_id = seqnames(x) %>% as.character(),
            quality = x$quality
        ) %>%
            left_join(
                mcols(transGR) %>% as.data.frame()
            ) %>%
            mutate(
                quality = factor(
                    quality, levels = c("Low", "Medium", "High")
                )
            ) %>%
            arrange(gene_id, desc(quality)) %>%
            distinct(gene_id, .keep_all = TRUE) %>%
            dplyr::select(tx_id, gene_id, quality)
    }) 
names(ireGR) <- str_extract(names(ireGR), "utr[35]")

ireHigh <- lapply(ireGR, subset, quality == "High")
ireByGene <- c(
    names(ireGR) %>%
        lapply(function(x){
            mutate(ireGR[[x]], Type = paste0(x, "_All"))
        }),
    names(ireHigh) %>%
        lapply(function(x){
            mutate(ireHigh[[x]], Type = x)
        })
) %>%
    bind_rows() %>%
    split(f = .$gene_id) %>%
    lapply(function(x){
        unique(x$Type)
    }) 

ireGoseq <- pwf %>%
   goseq(gene2cat = ireByGene) %>%
   as_tibble %>%
   mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr"))
GOseq analysis for IRE enrichment
Pathway numDEInCat numInCat PValue FDR
utr5_All 6 306 0.1834 0.7334
utr5 1 39 0.409 0.8179
utr3 1 72 0.6134 0.8179
utr3_All 9 883 0.9468 0.9468
ireGSEA <- c(
    names(ireGR) %>%
        lapply(function(x){
            mutate(ireGR[[x]], Type = paste0(x, "_All"))
        }),
    names(ireHigh) %>%
        lapply(function(x){
            mutate(ireHigh[[x]], Type = x)
        })
) %>%
    bind_rows() %>%
    split(f = .$Type) %>%
    lapply(function(x){
        unique(x$gene_id)
    }) 

fgseaIRE <- fgsea(ireGSEA, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)

fgseaIRE %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "GSEA analysis of IRE enrichment", percent(max(.$FDR)))
  )
The 4 GSEA analysis of IRE enrichment 65.9%
pathway pval FDR ES NES size padj
utr3 0.1586 0.5451 -0.4057 -1.224 72 0.6346
utr5 0.2726 0.5451 0.3681 1.12 39 1
utr3_All 0.6418 0.6589 -0.2567 -0.9589 883 1
utr5_All 0.6589 0.6589 -0.2621 -0.9276 306 1

Session Info

pander(sessionInfo())

R version 3.6.0 (2019-04-26)

Platform: x86_64-apple-darwin15.6.0 (64-bit)

locale: en_AU.UTF-8||en_AU.UTF-8||en_AU.UTF-8||C||en_AU.UTF-8||en_AU.UTF-8

attached base packages: splines, stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: pathview(v.1.24.0), org.Dr.eg.db(v.3.8.2), goseq(v.1.36.0), geneLenDataBase(v.1.20.0), BiasedUrn(v.1.07), kableExtra(v.1.1.0), cqn(v.1.30.0), quantreg(v.5.51), SparseM(v.1.77), preprocessCore(v.1.46.0), nor1mix(v.1.3-0), mclust(v.5.4.5), RUVSeq(v.1.18.0), EDASeq(v.2.18.0), ShortRead(v.1.42.0), GenomicAlignments(v.1.20.1), SummarizedExperiment(v.1.14.1), DelayedArray(v.0.10.0), matrixStats(v.0.54.0), Rsamtools(v.2.0.0), GenomicRanges(v.1.36.0), GenomeInfoDb(v.1.20.0), Biostrings(v.2.52.0), XVector(v.0.24.0), BiocParallel(v.1.18.1), pheatmap(v.1.0.12), fgsea(v.1.10.1), Rcpp(v.1.0.2), tidygraph(v.1.1.2), ggraph(v.1.0.2), plyr(v.1.8.4), org.Hs.eg.db(v.3.8.2), AnnotationDbi(v.1.46.1), IRanges(v.2.18.2), S4Vectors(v.0.22.0), variancePartition(v.1.14.0), Biobase(v.2.44.0), foreach(v.1.4.7), here(v.0.1), scales(v.1.0.0), qvalue(v.2.16.0), swfdr(v.1.10.0), ggrepel(v.0.8.1), RColorBrewer(v.1.1-2), pander(v.0.6.3), magrittr(v.1.5), forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.3), purrr(v.0.3.2), readr(v.1.3.1), tidyr(v.0.8.3), tibble(v.2.1.3), ggplot2(v.3.2.1), tidyverse(v.1.2.1), AnnotationHub(v.2.16.0), BiocFileCache(v.1.8.0), dbplyr(v.1.4.2), BiocGenerics(v.0.30.0), edgeR(v.3.26.7), biomaRt(v.2.40.4) and limma(v.3.40.6)

loaded via a namespace (and not attached): utf8(v.1.1.4), R.utils(v.2.9.0), tidyselect(v.0.2.5), lme4(v.1.1-21), RSQLite(v.2.1.2), grid(v.3.6.0), DESeq(v.1.36.0), munsell(v.0.5.0), codetools(v.0.2-16), withr(v.2.1.2), colorspace(v.1.4-1), highr(v.0.8), knitr(v.1.24), rstudioapi(v.0.10), labeling(v.0.3), KEGGgraph(v.1.44.0), GenomeInfoDbData(v.1.2.1), hwriter(v.1.3.2), polyclip(v.1.10-0), bit64(v.0.9-7), farver(v.1.1.0), rprojroot(v.1.3-2), vctrs(v.0.2.0), generics(v.0.0.2), xfun(v.0.9), R6(v.2.4.0), doParallel(v.1.0.15), locfit(v.1.5-9.1), bitops(v.1.0-6), assertthat(v.0.2.1), promises(v.1.0.1), gtable(v.0.3.0), rlang(v.0.4.0), MatrixModels(v.0.4-1), zeallot(v.0.1.0), genefilter(v.1.66.0), rtracklayer(v.1.44.3), lazyeval(v.0.2.2), broom(v.0.5.2), BiocManager(v.1.30.4), yaml(v.2.2.0), reshape2(v.1.4.3), modelr(v.0.1.5), GenomicFeatures(v.1.36.4), backports(v.1.1.4), httpuv(v.1.5.1), tools(v.3.6.0), gplots(v.3.0.1.1), progress(v.1.2.2), zlibbioc(v.1.30.0), RCurl(v.1.95-4.12), prettyunits(v.1.0.2), viridis(v.0.5.1), haven(v.2.1.1), colorRamps(v.2.3), data.table(v.1.12.2), aroma.light(v.3.14.0), hms(v.0.5.1), mime(v.0.7), evaluate(v.0.14), xtable(v.1.8-4), pbkrtest(v.0.4-7), XML(v.3.98-1.20), readxl(v.1.3.1), gridExtra(v.2.3), compiler(v.3.6.0), KernSmooth(v.2.23-15), crayon(v.1.3.4), minqa(v.1.2.4), R.oo(v.1.22.0), htmltools(v.0.3.6), mgcv(v.1.8-28), later(v.0.8.0), geneplotter(v.1.62.0), lubridate(v.1.7.4), DBI(v.1.0.0), tweenr(v.1.0.1), MASS(v.7.3-51.4), rappdirs(v.0.3.1), boot(v.1.3-23), Matrix(v.1.2-17), cli(v.1.1.0), R.methodsS3(v.1.7.1), gdata(v.2.18.0), igraph(v.1.2.4.1), pkgconfig(v.2.0.2), xml2(v.1.2.2), annotate(v.1.62.0), webshot(v.0.5.1), rvest(v.0.3.4), digest(v.0.6.20), graph(v.1.62.0), rmarkdown(v.1.15), cellranger(v.1.1.0), fastmatch(v.1.1-0), curl(v.4.0), shiny(v.1.3.2), gtools(v.3.8.1), nloptr(v.1.2.1), nlme(v.3.1-141), jsonlite(v.1.6), fansi(v.0.4.0), viridisLite(v.0.3.0), pillar(v.1.4.2), lattice(v.0.20-38), KEGGREST(v.1.24.1), GO.db(v.3.8.2), httr(v.1.4.1), survival(v.2.44-1.1), interactiveDisplayBase(v.1.22.0), glue(v.1.3.1), png(v.0.1-7), iterators(v.1.0.12), Rgraphviz(v.2.28.0), bit(v.1.1-14), ggforce(v.0.3.1), stringi(v.1.4.3), blob(v.1.2.0), latticeExtra(v.0.6-28), caTools(v.1.17.1.2) and memoise(v.1.1.0)